## Data Cleaning
# load raw data files
data = read.csv("../data/filledDatabase10042019NUMonlyREDUCED.csv")
# clean data
data = clean_data(data)
# separate compound and group_cate from the predictors
compound = data$Compound
group_cat = data$GroupCat
group_cat_text = paste("Grp", group_cat)
data = select(data, -c("Compound","GroupCat"))
data_biplot = data
fviz_pca_var(
prcomp(data, scale = TRUE),
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
Figure 1: Vectors of predictors in the space of PC1 and PC2
Here we choose first 17 PC for further analysis.
# print out the cumulative proportions for first k PC's
print_pca_importance(data, k = 20)
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 3.743427 3.089436 2.520277 2.41347 2.047305
## Proportion of Variance 0.237510 0.161770 0.107660 0.09873 0.071040
## Cumulative Proportion 0.237510 0.399290 0.506940 0.60567 0.676710
## PC6 PC7 PC8 PC9 PC10
## Standard deviation 1.904534 1.609677 1.36274 1.332563 1.120081
## Proportion of Variance 0.061480 0.043920 0.03148 0.030100 0.021260
## Cumulative Proportion 0.738190 0.782110 0.81358 0.843680 0.864940
## PC11 PC12 PC13 PC14 PC15
## Standard deviation 1.037753 1.019394 0.8502572 0.8243994 0.7964395
## Proportion of Variance 0.018250 0.017610 0.0122500 0.0115200 0.0107500
## Cumulative Proportion 0.883200 0.900810 0.9130600 0.9245800 0.9353300
## PC16 PC17 PC18 PC19 PC20
## Standard deviation 0.7065705 0.6779897 0.639666 0.5899257 0.5650318
## Proportion of Variance 0.0084600 0.0077900 0.006940 0.0059000 0.0054100
## Cumulative Proportion 0.9437900 0.9515900 0.958520 0.9644200 0.9698300
# transform data to a new space
data_pca = get_pc_space(data, k = 17) %>% scale()
set_color = c("#0071C3","#DE501A","#EEB020","#7E2E8E","#79AC2C","#4DBDF7","#A51331") %>%
rep(10)
data.frame(Compound = compound, GroupCat = group_cat_text, data_pca) %>%
ggplot(aes(x=PC1, y=PC2, color = GroupCat)) +
geom_point(aes(color = GroupCat), size = 2, alpha = 0.4) +
geom_text(aes(label=Compound, color=GroupCat), size = 3) +
scale_color_manual(values=set_color) +
scale_fill_manual(values=set_color) +
scale_shape_manual(values=1:11) +
theme_minimal()
Figure 2: Compounds in the space of the first two PC’s
library(RColorBrewer)
rownames(data_biplot) = make.names(compound, unique=TRUE)
fit <- princomp(data_biplot, cor=TRUE)
fviz_pca_biplot(fit, aesx = c(1,2),
# individual
label = "var", labelsize = 4,
geom = c("point","text"), fill.ind = group_cat_text, alpha.ind = 0.7,
pointsize = 2, pointshape = 21, palette = set_color[1:11],
# variable
col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel=TRUE) +
labs(fill = "Group Cat", color = "Contrib")
Figure 3: PCA biplot (compounds and predictor vectors in the space of the first two PC’s)
data.frame(Compound = compound, GroupCat = group_cat_text, data_pca) %>%
filter(GroupCat %in% c("Grp 3","Grp 5")) %>%
ggplot(aes(x=PC1, y=PC2, color = GroupCat)) +
# geom_point(aes(color = GroupCat), size = 4, alpha = 0.4) +
geom_text(aes(label=Compound, color=GroupCat), size = 3, alpha = 0.7) +
scale_color_manual(values=set_color) +
scale_fill_manual(values=set_color) +
scale_shape_manual(values=1:11) +
theme_minimal()
Figure 4: Cubic and Tilted in the space of the first two PC’s
data.frame(Compound = compound, GroupCat = group_cat_text, data_pca) %>%
filter(GroupCat %in% c("Grp 3","Grp 5")) %>%
droplevels() %>%
plot_ly(x = ~PC1, y = ~PC2, z = ~PC3, color = ~GroupCat, text = ~Compound, colors = set_color[1:2]) %>%
add_markers(marker=list(size = 7, opacity = 0.6)) %>%
layout(scene = list(xaxis = list(title = 'PC1'),
yaxis = list(title = 'PC2'),
zaxis = list(title = 'PC3')),
title = "Cubic and Titled in the space of PC1, PC2, PC3")
Figure 5: Cubic and Tilted in the space of the first two PC’s
fit = princomp(data_biplot, cor=TRUE)
new_color = c()
for(i in 1:11){
group_tag = levels(as.factor(group_cat_text))[i]
if(group_tag %in% c("Grp 3","Grp 5")){
new_color[i] = set_color[i+2]
}else{
new_color[i] = "white"
}
}
fviz_pca_biplot(fit, aesx = c(1,2),
# individual
label = "var", labelsize = 4,
geom = c("point","text"), fill.ind = group_cat_text, col.ind = "white", alpha.ind = 0.7,
pointsize = 2, pointshape = 21, palette = new_color,
# variable
col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel=TRUE) +
labs(fill = "Group Cat", color = "Contrib", title = "PCA biplot - Cubic and Tilted")
# data_3 = data[group_cat_text %in% c("Grp 3"),] %>% mutate(GroupCat = "Grp 3")
# data_5 = data[group_cat_text %in% c("Grp 5"),] %>% mutate(GroupCat = "Grp 5")
#
# data_3_5 = rbind(data_3, data_5)
# ggplot(melt(data_3_5[,c(49:59,60)], id.vars="GroupCat"), aes(value)) +
# geom_histogram(bins = 20, aes(fill=GroupCat), position = "dodge", color = "grey", size = 0.1) +
# facet_wrap(~variable, scales = 'free_x') +
# scale_fill_manual(values = set_color) +
# theme_bw()
kmean_results = get_kmeans_results(data_pca, k = 2)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 3)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 4)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 5)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 6)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 7)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 8)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 9)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 10)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 11)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 12)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 13)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 14)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 15)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 16)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 20)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 24)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
kmean_results = get_kmeans_results(data_pca, k = 25)
get_kmeans_visual_2D(kmean_results)
get_kmeans_histogram(kmean_results)
### explore k visually
# choose the number of clusters to explore
# k = 4
#
# # build a model for kmeans
# kmean_results = get_kmeans_results(data_pca, k = k)
#
# # generate 2D visual for this k
# get_kmeans_visual_2D(kmean_results)
#
# # histogram matlab
# get_kmeans_histogram(kmean_results)
# generate 3D visual for this k
# get_kmeans_visual_3D(kmean_results)
### save results
# choose k to save
# k = c(2:25)
# save the result as csv
# save_kmeans_results(data, data_pca, k)
knitr::opts_chunk$set(
fig.width = 10,
fig.height = 5
)
Based on a criterion BIC, we choose a Gaussian mixture model with 3 components.
### mclust visuals
# build a model for mclust
model = get_mclust_model(data_pca)
# print summary table of the model
summary(model)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model
## with 3 components:
##
## log-likelihood n df BIC ICL
## -4453.406 311 512 -11845.59 -11845.6
##
## Clustering table:
## 1 2 3
## 134 142 35
# generate 2D visual for mclust
get_mclust_visual_2D(model)
# histogram matlab
get_mclust_histogram(model)
# generate 3D visual for mclust
# get_mclust_visual_3D(model)
### save results
# save_mclust_results(data, data_pca, compound, group_cat)
x
# # check any missing values in chem group
# group_cat %>% check_missing()
#
# # dummy first k chem group
# group_cat_dummy = dummy_group_cat(group_cat, k = 5)
#
# # normalize data_pca
# data_pca_scaled = scale(data_pca) %>% as.data.frame()
#
# # compute cv classification rate
# get_nn_cv_rate(group_cat_dummy, data_pca_scaled) %>% mean()
# build a model for neural nets
# model = get_nn_model(group_cat_dummy, data_pca_scaled)
# plot neural nets
# library(devtools)
# source_url('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r')
# plot.nnet(model)
# save nn results
# save_nn_results(model, compound, group_cat, data, names(group_cat_dummy))
# save model weights (layers = 20,10,5)
# data.frame(weight = model$result.matrix) %>%
# rownames_to_column() %>%
# filter(between(row_number(), 4, n())) %>%
# separate(rowname, c("node_in","node_out"), sep = ".to.") %>%
# write.csv("../result/nn weights.csv", row.names = FALSE)
### explore for k
# k = c(3:11)
#
# dummy = list()
# for(i in 1:length(k)){
# dummy[[i]] = repeat_B(k = k[i])
# }
#
# data.frame(dummy) %>%
# set_colnames(paste("k=", k, sep="")) %>%
# reshape2::melt() %>%
# ggplot(aes(x = value)) +
# geom_histogram(bins = 30) +
# labs(x="Classification Rate", y="Frequency") +
# theme_bw() + coord_flip() +
# facet_grid(~variable)
# data.frame(dummy) %>% set_colnames(paste("k=", k, sep="")) %>%
# write.csv("../result/nn class rate.csv", row.names = FALSE)